import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Load the Kaggle stroke-prediction dataset (one row per patient).
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df
| id | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9046 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
| 1 | 51676 | Female | 61.0 | 0 | 0 | Yes | Self-employed | Rural | 202.21 | NaN | never smoked | 1 |
| 2 | 31112 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
| 3 | 60182 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
| 4 | 1665 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5105 | 18234 | Female | 80.0 | 1 | 0 | Yes | Private | Urban | 83.75 | NaN | never smoked | 0 |
| 5106 | 44873 | Female | 81.0 | 0 | 0 | Yes | Self-employed | Urban | 125.20 | 40.0 | never smoked | 0 |
| 5107 | 19723 | Female | 35.0 | 0 | 0 | Yes | Self-employed | Rural | 82.99 | 30.6 | never smoked | 0 |
| 5108 | 37544 | Male | 51.0 | 0 | 0 | Yes | Private | Rural | 166.29 | 25.6 | formerly smoked | 0 |
| 5109 | 44679 | Female | 44.0 | 0 | 0 | Yes | Govt_job | Urban | 85.28 | 26.2 | Unknown | 0 |
5110 rows × 12 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5110 entries, 0 to 5109 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5110 non-null int64 1 gender 5110 non-null object 2 age 5110 non-null float64 3 hypertension 5110 non-null int64 4 heart_disease 5110 non-null int64 5 ever_married 5110 non-null object 6 work_type 5110 non-null object 7 Residence_type 5110 non-null object 8 avg_glucose_level 5110 non-null float64 9 bmi 4909 non-null float64 10 smoking_status 5110 non-null object 11 stroke 5110 non-null int64 dtypes: float64(3), int64(4), object(5) memory usage: 479.2+ KB

df.describe().transpose().drop("count",axis=1)
| mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|
| id | 36517.829354 | 21161.721625 | 67.00 | 17741.250 | 36932.000 | 54682.00 | 72940.00 |
| age | 43.226614 | 22.612647 | 0.08 | 25.000 | 45.000 | 61.00 | 82.00 |
| hypertension | 0.097456 | 0.296607 | 0.00 | 0.000 | 0.000 | 0.00 | 1.00 |
| heart_disease | 0.054012 | 0.226063 | 0.00 | 0.000 | 0.000 | 0.00 | 1.00 |
| avg_glucose_level | 106.147677 | 45.283560 | 55.12 | 77.245 | 91.885 | 114.09 | 271.74 |
| bmi | 28.893237 | 7.854067 | 10.30 | 23.500 | 28.100 | 33.10 | 97.60 |
| stroke | 0.048728 | 0.215320 | 0.00 | 0.000 | 0.000 | 0.00 | 1.00 |
# check/drop null values
# isna() is an exact alias of isnull(); only bmi has missing entries (201).
df.isna().sum()
id 0 gender 0 age 0 hypertension 0 heart_disease 0 ever_married 0 work_type 0 Residence_type 0 avg_glucose_level 0 bmi 201 smoking_status 0 stroke 0 dtype: int64
# Visualise the missingness pattern; gaps appear only in the bmi column.
import missingno as msno
msno.matrix(df);
# Simple alternative imputation, kept for reference:
# df["bmi"] = df["bmi"].interpolate(method = "linear")
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Impute the 201 missing bmi values with a decision tree regressed on
# age + gender. (Scaling is a no-op for tree splits but is kept from the
# original pipeline; the step is named 'tree' — it was misleadingly 'lr'.)
DT_bmi_pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('tree', DecisionTreeRegressor(random_state=42))
])
X = df[['age', 'gender', 'bmi']].copy()
# int8, NOT uint8: casting the intended 'Other' code -1 to uint8 silently
# wraps it to 255, corrupting the encoding.
X['gender'] = X['gender'].replace({'Male': 0, 'Female': 1, 'Other': -1}).astype(np.int8)
Missing = X[X.bmi.isna()]          # rows to impute
X = X[~X.bmi.isna()]               # rows to train on
y = X.pop('bmi')
DT_bmi_pipe.fit(X, y)
predicted_bmi = pd.Series(DT_bmi_pipe.predict(Missing[['age', 'gender']]),
                          index=Missing.index)
df.loc[Missing.index, 'bmi'] = predicted_bmi
# Confirm no nulls remain.
df.isnull().sum()
id 0 gender 0 age 0 hypertension 0 heart_disease 0 ever_married 0 work_type 0 Residence_type 0 avg_glucose_level 0 bmi 0 smoking_status 0 stroke 0 dtype: int64
# check blank
# Membership test over the raw cell values; True would mean a literal
# single-space cell exists somewhere in the frame.
" " in df.values
False
# check/drop duplicates
# Rows whose `id` repeats an earlier row; an empty result means no duplicates.
df[df.duplicated("id")]
| id | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke |
|---|
# Drop the single 'Other' gender row. .copy() makes df a real frame rather
# than a view of the slice, so the column assignments below do not trigger
# SettingWithCopyWarning (which the blanket warnings filter above would hide).
df = df[df["gender"] != "Other"].copy()
# Bin bmi into standard categories; pd.cut uses right-closed intervals,
# e.g. (19, 25] -> 'Ideal'.
df['bmi_cat'] = pd.cut(df['bmi'], bins=[0, 19, 25, 30, 10000],
                       labels=['Underweight', 'Ideal', 'Overweight', 'Obesity'])
df.bmi_cat.value_counts()
Obesity 2011 Overweight 1475 Ideal 1203 Underweight 420 Name: bmi_cat, dtype: int64
## Bin BMI into groups with a helper function (equivalent to the pd.cut above).
def bmi_group(value):
    """Return the BMI category for a numeric BMI value.

    Bins mirror the pd.cut version above (right-closed intervals):
    (0, 19] -> 'Underweight', (19, 25] -> 'Ideal', (25, 30] -> 'Overweight',
    everything else (including > 30) -> 'Obesity'.
    """
    # Each guard already excludes the previous range, so no compound
    # `a < value <= b` checks are needed.
    if value <= 19:
        return 'Underweight'
    if value <= 25:
        return 'Ideal'
    if value <= 30:
        return 'Overweight'
    return 'Obesity'
# Apply the helper element-wise; counts should match bmi_cat exactly.
df["bmi_cat2"] = df["bmi"].map(bmi_group)
df.bmi_cat2.value_counts()
Obesity 2011 Overweight 1475 Ideal 1203 Underweight 420 Name: bmi_cat2, dtype: int64
# Ordinal encoding of the BMI category (0 = Underweight ... 3 = Obesity).
bmi_order = {'Underweight': 0, 'Ideal': 1, 'Overweight': 2, 'Obesity': 3}
df['bmi_cat3'] = df['bmi_cat2'].map(bmi_order).astype(int)
df
| gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | bmi_cat | bmi_cat2 | bmi_cat3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.600000 | formerly smoked | 1 | Obesity | Obesity | 3 |
| 1 | Female | 61.0 | 0 | 0 | Yes | Self-employed | Rural | 202.21 | 29.879487 | never smoked | 1 | Overweight | Overweight | 2 |
| 2 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.500000 | never smoked | 1 | Obesity | Obesity | 3 |
| 3 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.400000 | smokes | 1 | Obesity | Obesity | 3 |
| 4 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.000000 | never smoked | 1 | Ideal | Ideal | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5105 | Female | 80.0 | 1 | 0 | Yes | Private | Urban | 83.75 | 28.476923 | never smoked | 0 | Overweight | Overweight | 2 |
| 5106 | Female | 81.0 | 0 | 0 | Yes | Self-employed | Urban | 125.20 | 40.000000 | never smoked | 0 | Obesity | Obesity | 3 |
| 5107 | Female | 35.0 | 0 | 0 | Yes | Self-employed | Rural | 82.99 | 30.600000 | never smoked | 0 | Obesity | Obesity | 3 |
| 5108 | Male | 51.0 | 0 | 0 | Yes | Private | Rural | 166.29 | 25.600000 | formerly smoked | 0 | Overweight | Overweight | 2 |
| 5109 | Female | 44.0 | 0 | 0 | Yes | Govt_job | Urban | 85.28 | 26.200000 | Unknown | 0 | Overweight | Overweight | 2 |
5109 rows × 14 columns
# Mean of each numeric column per outcome class. numeric_only=True is
# required on pandas >= 2.0, where object columns (gender, work_type, ...)
# make a bare .mean() raise instead of being silently dropped.
stroke_summary = df.groupby("stroke").mean(numeric_only=True).reset_index()
stroke_summary
| stroke | id | age | hypertension | heart_disease | avg_glucose_level | bmi | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 36483.189300 | 41.974831 | 0.088889 | 0.047119 | 104.787584 | 28.847094 |
| 1 | 1 | 37115.068273 | 67.728193 | 0.265060 | 0.188755 | 132.544739 | 30.336552 |
# Split the frame by outcome for the per-class plots below.
stroke = df[df["stroke"] == 1]
non_stroke = df[df["stroke"] == 0]
# Pie chart of the (heavily imbalanced) class distribution.
labels = ["stroke", "non_stroke"]
sizes = df.stroke.value_counts().values
plt.pie(x=sizes, labels=labels, explode=[0, 0.1], startangle=90,
        colors=['skyblue', 'orange'], autopct="%1.1f%%");
# Same information as exact fractions.
df.stroke.value_counts(normalize=True)
0 0.951262 1 0.048738 Name: stroke, dtype: float64
# Age distributions for stroke vs non-stroke, density-normalised per class.
sns.histplot(stroke["age"],stat="density",kde=True,ec="w",label="stroke");
sns.histplot(non_stroke["age"],stat="density",kde=True,color="orange",ec="w",alpha=0.4,label="non_stroke");
plt.legend(loc="best");
# BMI and glucose densities by outcome; common_norm=False normalises each
# hue class separately instead of jointly (per seaborn docs).
sns.kdeplot(data=df, x="bmi", hue="stroke",fill=True, common_norm=False);
sns.histplot(data=df, x="bmi", hue="stroke",bins=np.arange(0,60,2),common_norm=False,stat="density",edgecolor="w");
sns.kdeplot(data=df, x="avg_glucose_level", hue="stroke",fill=True, common_norm=False);
# Why is there a valley in the glucose density? Possibly a bimodal mix of
# normal vs elevated (diabetic-range) glucose — worth verifying.
# df['bmi_cat'] = pd.cut(df['bmi'], bins = [0, 19, 25,30,10000], labels = ['Underweight', 'Ideal', 'Overweight', 'Obesity'])
# df['age_cat'] = pd.cut(df['age'], bins = [0,13,18, 45,60,200], labels = ['Children', 'Teens', 'Adults','Mid Adults','Elderly'])
# df['glucose_cat'] = pd.cut(df['avg_glucose_level'], bins = [0,90,160,230,500], labels = ['Low', 'Normal', 'High', 'Very High'])
# What matters most: a barplot of the stroke *rate* per category
# (sns.barplot plots the mean of `stroke`, i.e. the probability).
sns.barplot(data=df,x="gender",y="stroke")
plt.ylabel("probability of stroke", fontsize = 10);
# Male/female counts in the data; not very informative on its own —
# what matters is the stroke rate per sex (barplot above).
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
sns.set(rc = {'figure.figsize':(5,4)})
sns.countplot(x='gender',data=df);
# Raw counts are not very useful for heavily imbalanced data like this.
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
sns.set(rc = {'figure.figsize':(5,4)})
sns.countplot(x='stroke',hue='gender',data=df);
plt.subplot(1,2,2)
sns.set(rc = {'figure.figsize':(5,4)})
sns.countplot(x='gender',hue='stroke',data=df);
# What matters most: the stroke rate per category (barplot of the mean).
plt.figure(figsize=(5,4))
sns.barplot(data=df,x="hypertension",y="stroke")
plt.ylabel("probability of stroke", fontsize = 10);
# What matters most: the stroke rate per category (barplot of the mean).
plt.figure(figsize=(5,4))
sns.barplot(data=df,x="ever_married",y="stroke")
plt.ylabel("probability of stroke", fontsize = 10);
# Same view for heart disease ...
plt.figure(figsize=(5,4))
sns.barplot(data=df,x="heart_disease",y="stroke")
plt.ylabel("probability of stroke", fontsize = 10);
# ... and for work type (labels rotated for readability).
plt.figure(figsize=(5,4))
sns.barplot(data=df,x="work_type",y="stroke")
plt.xticks(rotation=60);
plt.ylabel("probability of stroke", fontsize = 10);
# work_type counts in the data; not very informative on its own — the
# stroke rate per group (barplot above) is what matters.
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
sns.set(rc = {'figure.figsize':(5,4)})
plt.xticks(rotation=60)
sns.countplot(x='work_type',data=df);
# Stroke rate by residence type and by smoking status.
plt.figure(figsize=(5,4))
sns.barplot(data=df,x="Residence_type",y="stroke")
plt.xticks(rotation=60);
plt.ylabel("probability of stroke", fontsize = 10);
plt.figure(figsize=(5,4))
sns.barplot(data=df,x="smoking_status",y="stroke")
plt.xticks(rotation=60);
plt.ylabel("probability of stroke", fontsize = 10);
# Age histograms faceted by gender (cols) and stroke (rows).
# seaborn >= 0.9 renamed FacetGrid's `size` parameter to `height`.
g = sns.FacetGrid(df, col="gender", row="stroke", hue="stroke",
                  margin_titles=True, height=4)
g = g.map(plt.hist, "age", edgecolor='white');
g.fig.suptitle("stroke by Sex and Age", size=20)
plt.subplots_adjust(top=0.90);
# Glucose histograms faceted by gender (cols) and stroke (rows).
# seaborn >= 0.9 renamed FacetGrid's `size` parameter to `height`.
g = sns.FacetGrid(df, col="gender", row="stroke", hue="stroke",
                  margin_titles=True, height=4)
g = g.map(plt.hist, "avg_glucose_level", edgecolor='white');
g.fig.suptitle("stroke by gender and avg_glucose_level", size=20)
plt.subplots_adjust(top=0.90);
sns.jointplot(data=df,x="age",y="avg_glucose_level",hue="stroke");
# Scatter of glucose vs age per gender, coloured by stroke.
# seaborn >= 0.9 renamed FacetGrid's `size` parameter to `height`;
# the title said "Survived" (a Titanic-notebook leftover) — fixed.
g = sns.FacetGrid(data=df, col="gender", hue="stroke", margin_titles=True, height=4)
g.map(plt.scatter, "avg_glucose_level", "age", edgecolor="w").add_legend()
g.fig.suptitle("Stroke by Sex, avg_glucose_level and Age", size=25)
plt.subplots_adjust(top=0.85)
# NOTE(review): the original comments here referred to Titanic data (Fare,
# passengers) and did not apply. For this dataset: the grid above shows
# avg_glucose_level vs age split by gender and stroke; most records sit in
# the lower glucose range, with a separate high-glucose cluster (> ~150)
# that appears relatively more often among stroke cases — verify before
# treating any of these points as deletable outliers.
# Interactive parallel-categories view of the features, coloured by stroke.
# NOTE(review): `age` is continuous, so it produces many tiny categories here
# — consider binning it (e.g. the commented age_cat above) first.
import plotly.express as px
fig = px.parallel_categories(df[['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
'work_type', 'Residence_type',
'smoking_status', 'stroke']], color='stroke', color_continuous_scale=px.colors.sequential.Inferno)
fig.show()
# `id` is an arbitrary identifier with no signal; drop it, then split
# features from the target.
df = df.drop("id",axis=1)
X, y = df.drop("stroke",axis=1), df["stroke"]
# Lower-triangle correlation heatmap of features + target.
corr = X.join(y).corr()          # compute once instead of twice
# dtype=bool: the np.bool alias was deprecated in NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.set_style('whitegrid')
plt.subplots(figsize=(12, 10))
sns.heatmap(corr,
            annot=True,          # print the correlation value in each cell
            mask=mask,           # hide the upper triangle (mirror of the lower)
            cmap='RdBu',         # use "RdBu_r" to reverse the colour bar
            linewidths=.9,       # thin gaps between cells
            linecolor='white',   # gap colour (white is the default anyway)
            fmt='.2g',
            center=0,            # centre the colormap at 0 (else use vmin=-1, vmax=1)
            square=True);
# NOTE(review): this comment referenced Titanic columns (SibSp/Parch) and did
# not apply here. Check the heatmap above for strongly correlated feature
# pairs before deciding anything about multicollinearity.
# get dummies
# One-hot encode the categorical columns; drop_first avoids the dummy trap.
# NOTE(review): the displayed X below has no bmi_cat*/bmi_cat2/bmi_cat3
# columns even though they were added above — the notebook was likely
# re-run out of order; confirm which feature set was actually modelled.
X = pd.get_dummies(X,drop_first=True)
X
| age | hypertension | heart_disease | avg_glucose_level | bmi | gender_Male | ever_married_Yes | work_type_Never_worked | work_type_Private | work_type_Self-employed | work_type_children | Residence_type_Urban | smoking_status_formerly smoked | smoking_status_never smoked | smoking_status_smokes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 67.0 | 0 | 1 | 228.69 | 36.600000 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
| 1 | 61.0 | 0 | 0 | 202.21 | 29.879487 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 80.0 | 0 | 1 | 105.92 | 32.500000 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 49.0 | 0 | 0 | 171.23 | 34.400000 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 4 | 79.0 | 1 | 0 | 174.12 | 24.000000 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5105 | 80.0 | 1 | 0 | 83.75 | 28.476923 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 5106 | 81.0 | 0 | 0 | 125.20 | 40.000000 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| 5107 | 35.0 | 0 | 0 | 82.99 | 30.600000 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 5108 | 51.0 | 0 | 0 | 166.29 | 25.600000 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 5109 | 44.0 | 0 | 0 | 85.28 | 26.200000 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5109 rows × 15 columns
# Stratified 80/20 split preserves the ~5% stroke prevalence in both sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,test_size=0.2,stratify=y,random_state=42)
X_train
| age | hypertension | heart_disease | avg_glucose_level | bmi | gender_Male | ever_married_Yes | work_type_Never_worked | work_type_Private | work_type_Self-employed | work_type_children | Residence_type_Urban | smoking_status_formerly smoked | smoking_status_never smoked | smoking_status_smokes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 845 | 48.0 | 0 | 0 | 69.21 | 33.1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 3745 | 29.0 | 0 | 0 | 84.19 | 21.2 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 4184 | 35.0 | 0 | 0 | 119.40 | 22.9 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3410 | 38.0 | 0 | 0 | 108.68 | 32.7 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 284 | 14.0 | 0 | 0 | 82.34 | 31.6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1434 | 45.0 | 0 | 0 | 92.86 | 35.1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
| 461 | 16.0 | 0 | 0 | 113.47 | 19.5 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1052 | 61.0 | 0 | 0 | 78.65 | 36.2 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1757 | 31.0 | 0 | 0 | 74.05 | 26.0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 5053 | 46.0 | 0 | 0 | 55.84 | 27.8 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
4087 rows × 15 columns
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train_std = scaler.fit_transform(X_train)
# X_test_std = scaler.transform(X_test)
# X_train_std_df = pd.DataFrame(data=X_train_std, columns=X_train.columns)
# X_test_std_df = pd.DataFrame(data=X_test_std, columns=X_test.columns)
# Balance the training set (only!) by oversampling the minority class.
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))
from imblearn.over_sampling import SMOTE
# sampling_strategy must be passed by keyword on imbalanced-learn >= 0.6
# (the old positional `ratio` argument was removed); random_state pins the
# synthetic samples for reproducibility, matching the 42 used elsewhere.
oversample = SMOTE(sampling_strategy="minority", random_state=42)
X_train,y_train = oversample.fit_resample(X_train,y_train)
print("After OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train==0)))
Before OverSampling, counts of label '1': 199 Before OverSampling, counts of label '0': 3888 After OverSampling, counts of label '1': 3888 After OverSampling, counts of label '0': 3888
# Baseline random forest trained on the SMOTE-balanced training set.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
from sklearn.metrics import accuracy_score
# Accuracy is misleading at ~5% prevalence — see the per-class report below.
accuracy_score(y_test, y_pred)
0.9129158512720157
# Per-class precision/recall/F1; the minority (stroke) class is what matters.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.95 0.95 0.95 972
1 0.12 0.12 0.12 50
accuracy 0.91 1022
macro avg 0.54 0.54 0.54 1022
weighted avg 0.91 0.91 0.91 1022
# ROC curve for the random forest on the held-out test set.
from sklearn import metrics
from sklearn.metrics import roc_auc_score
# [:, 1] (idiomatic) instead of the odd [::,1] stride slice — same column:
# the predicted probability of the positive (stroke) class.
y_pred_proba = rf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="auc=" + str(auc))
plt.legend(loc=4)
plt.title("ROC curve")
plt.show()
# L1-regularised logistic regression (liblinear is a solver that supports l1).
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear',penalty= 'l1',random_state = 42)
## fit the model with "train_x" and "train_y"
logreg.fit(X_train,y_train)
## Once the model is trained we want to find out how well the model is performing, so we test the model.
## we use "X_test" portion of the data(this data was not used to fit the model) to predict model outcome.
y_pred = logreg.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.97 0.84 0.90 972
1 0.16 0.58 0.25 50
accuracy 0.83 1022
macro avg 0.57 0.71 0.58 1022
weighted avg 0.94 0.83 0.87 1022
# ROC curve and confusion matrix for the logistic-regression model.
from sklearn import metrics
# [:, 1] (idiomatic) instead of [::,1] — probability of the positive class.
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="auc=" + str(auc))
plt.legend(loc=4)
plt.title("ROC curve")
plt.show()
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt = 'd', cmap = 'Blues', annot_kws = {'size': 16})
plt.xlabel('Predicted')
plt.ylabel('Actual');
from sklearn.metrics import f1_score
# Weighted F1 is dominated by the large non-stroke class.
f1_score(y_test, y_pred, average='weighted')
0.871978719819598
# Default (binary) F1 scores only the positive (stroke) class.
f1_score(y_test, y_pred)
0.25
# Macro F1 averages both classes equally.
f1_score(y_test, y_pred, average='macro')
0.5769867549668874
X_train
| age | hypertension | heart_disease | avg_glucose_level | bmi | gender_Male | ever_married_Yes | work_type_Never_worked | work_type_Private | work_type_Self-employed | work_type_children | Residence_type_Urban | smoking_status_formerly smoked | smoking_status_never smoked | smoking_status_smokes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 48.000000 | 0 | 0 | 69.210000 | 33.100000 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 1 | 29.000000 | 0 | 0 | 84.190000 | 21.200000 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2 | 35.000000 | 0 | 0 | 119.400000 | 22.900000 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 38.000000 | 0 | 0 | 108.680000 | 32.700000 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 14.000000 | 0 | 0 | 82.340000 | 31.600000 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7771 | 59.333464 | 0 | 0 | 112.610615 | 32.800313 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7772 | 57.000000 | 0 | 0 | 85.600565 | 34.309833 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7773 | 57.608184 | 0 | 0 | 218.815632 | 35.675792 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7774 | 66.159999 | 0 | 0 | 228.662700 | 34.667998 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7775 | 74.462535 | 0 | 0 | 198.493234 | 26.035662 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7776 rows × 15 columns
# feature_names = [i for i in X_train_std_df.columns if X_train_std_df[i].dtype in [np.float64]]
# target_names = "stroke"
# from sklearn import tree
# import graphviz
# from sklearn.tree import export_graphviz
# dot_data = export_graphviz(rf.estimators_[0],
# feature_names= feature_names,
# class_names= target_names,
# filled=True, impurity=True,
# rounded=True,out_file=None)
# graph = graphviz.Source(dot_data, format='png')
# graph
# Partial dependence of the forest's prediction on individual features.
from sklearn.inspection import PartialDependenceDisplay
PartialDependenceDisplay.from_estimator(rf, X_train,['age'],kind='average');
PartialDependenceDisplay.from_estimator(rf, X_train,['hypertension'],kind='average');
import shap
# Explain the fitted random forest with SHAP.
explainer = shap.TreeExplainer(rf)
# calculate shap values. This is what we will plot.
shap_values = explainer.shap_values(X_train)
# The shap_values object above is a list with two arrays. the second array is the list of SHAP values for the positive outcome.
# We typically think about predictions in terms of the prediction of a positive outcome
# NOTE(review): newer shap releases may return a single 3-D array rather than
# a per-class list — confirm the installed version before indexing with [1].
shap.summary_plot(shap_values[1], X_train,alpha=0.4)
# The summary plot combines feature importance with feature effects.
# Each point on the summary plot is a Shapley value for a feature and an instance.
# The position on the y-axis is determined by the feature and on the x-axis by the Shapley value.
# Feature importance: (Vertical location) Variables are ranked in descending order.
# Impact: (Horizontal location) shows whether the effect of that value is associated with a higher or lower prediction.
# The SHAP Dependence plot shows the marginal effect one or two features have on the predicted outcome of a machine learning model
# It tells whether the relationship between the target and a feature is linear, monotonic or more complex
shap.dependence_plot('age', shap_values[1], X_train)
# Explain one individual prediction (row 4082 of the resampled X_train;
# NOTE(review): this hard-coded index assumes the post-SMOTE 0..n RangeIndex).
display(X_train.loc[[4082]])
choosen_instance = X_train.loc[[4082]]
shap_values = explainer.shap_values(choosen_instance)
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], choosen_instance)
# Feature values in pink cause to increase the prediction. Feature values in blue cause to decrease the prediction
# Size of the bar shows the magnitude of the feature's effect.
| age | hypertension | heart_disease | avg_glucose_level | bmi | gender_Male | ever_married_Yes | work_type_Never_worked | work_type_Private | work_type_Self-employed | work_type_children | Residence_type_Urban | smoking_status_formerly smoked | smoking_status_never smoked | smoking_status_smokes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4082 | 45.0 | 0 | 0 | 92.86 | 35.1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |